cor2cov = function(R, std) {
  Sigma = outer(std, std) * R
  return(Sigma)
}

get_upper_tri = function(cormat){
    cormat[lower.tri(cormat)] = NA
    diag(cormat) = NA
    return(cormat)
}

p_dens = function(p_mat) {
  p_vec = p_mat[upper.tri(p_mat)]
  
  df_line = data.frame(xintercept = c(0.01, 0.05), 
                       Lines = c("p = 0.01", "p = 0.05"),
                       stringsAsFactors = FALSE)
  
  fig_p = data.frame(p = p_vec) %>%
    ggplot(aes(x = p)) +
    stat_ecdf(geom = "point") +
    geom_vline(aes(xintercept = xintercept, color = Lines), df_line, 
               linetype = "dashed", size = 1) +
    labs(x = "P-value", y = "Cumulative density") +
    scale_color_discrete(name = NULL) +
    theme_bw()
  return(fig_p)
}

p_filter = function(mat, mat_p, max_p){
  ind_p = mat_p
  ind_p[mat_p > max_p] = 0
  ind_p[mat_p <= max_p] = 1
  
  mat_filter = mat * ind_p
  return(mat_filter)
}

hard_thresh = function(R, th){
  R_th = R
  R_th[abs(R) <= th] = 0
  return(R_th)
}

1. Data generation

1.1 Absolute abundances

set.seed(123)
n = 50
d = 100
mu = c(rep(NA, 4), # Taxa with correlations
       10000, 10000, # High abundant taxa
       sample(c(200, 50), d - 6, replace = TRUE, prob = c(0.8, 0.2))) # Low abundant taxa
# NB distribution
dispersion = 0.5

# Absolute abundances
A = matrix(NA, ncol = n, nrow = d)
for (i in seq.int(from = 5, to = d)) {
  A[i, ] = rnbinom(n = n, size = 1/dispersion, mu = mu[i])
}

# Dependent pairs of taxa
abn_mean = c(2000, 2000)
template_var = log((abn_mean + dispersion * abn_mean^2)/abn_mean^2 + 1)
template_mean = log(abn_mean) - template_var/2

Sigma = cor2cov(R = diag(1, nrow = 2), std = sqrt(template_var))
template = t(MASS::mvrnorm(n = n, mu = template_mean, Sigma = Sigma))
template1 = template[1, ]
template2 = template[2, ]
poly1 = template_mean[1] * poly(x = template1, degree = 1, raw = FALSE)[, 1] + template_mean[1]
poly2 = template_mean[2] * poly(x = template2, degree = 2, raw = FALSE)[, 2] + template_mean[2]
A_dep = round(exp(rbind(template, poly1, poly2)))

for (i in 1:4) {
  A[i, ] = A_dep[i, ]
}
mu[1:2] = 2000
mu[3:4] = 10000

taxa_id = paste0("T", seq_len(d))
sample_id = paste0("S", seq_len(n))
rownames(A) = taxa_id
colnames(A) = sample_id

p_a = data.frame(t(log(A[1:5, ]))) %>% 
  ggpairs(title = "Synthetic Log Absolute Abundances",
          progress = FALSE,
          upper = NULL, diag = NULL) +
  theme_bw() + 
  theme(plot.title = element_text(hjust = 0.5),
        strip.background = element_rect(fill = "white"))
print(p_a)

1.2 Observed abundances

# Sequencing efficiency
C = rbeta(n = d, shape1 = 5, shape2 = 5)

# Microbial loads in the ecosystem
A_prim = A * C
A_dot = colSums(A_prim)

# Relative abundances in the ecosystem
R = A_prim/t(replicate(d, A_dot))

# Sampling fractions
S = rbeta(n = n, shape1 = 2, shape2 = 10)

# Library sizes
O_dot = round(S * A_dot)
# Observed abundances
O = matrix(NA, nrow = d, ncol = n)
for (i in seq(n)) {
  O[, i] = rmultinom(1, size = O_dot[i], prob = R[, i])
}
rownames(O) = taxa_id
colnames(O) = sample_id

# Random errors
E = O/(A * C * t(replicate(d, S)))

# Log scale parameters
a = log(A)
c = log(C)
s = log(S)
e = log(E)
# Log scale variables
o = log(O)
r = O/t(replicate(d, O_dot))

p_o = data.frame(t(log(O[1:5, ]))) %>% 
  ggpairs(title = "Synthetic Log Observed Abundances",
          progress = FALSE, upper = NULL, diag = NULL) +
  theme_bw() + 
  theme(plot.title = element_text(hjust = 0.5),
        strip.background = element_rect(fill = "white"))
print(p_o)

1.3 Example: taxon 1 vs. taxon 2

# Absolute abundance
A_t12 = data.frame(t(A[1:2, ]))
ls_cor_test = cor.test(x = A_t12$T1, y = A_t12$T2, method = "pearson")
df_ann = data.frame(x = 4000, y = 4000) %>%
  mutate(label = paste0("r = ", round(ls_cor_test$estimate, 2),
                        "\n", "p = ", round(ls_cor_test$p.value, 2)))
p_A_t12 = A_t12 %>%
  ggplot(aes(x = T1, y = T2)) +
  geom_point(size = 4, alpha = 0.6) +
  labs(title = "Absolute Abundances for T1 and T2") +
  geom_label(data = df_ann, aes(x = x, y = y, label = label), 
             size = 6, vjust = -0.5, hjust = 0, color = "black") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

# Observed abundance
O_t12 = data.frame(t(O[1:2, ]))
ls_cor_test = cor.test(x = O_t12$T1, y = O_t12$T2, method = "pearson")
df_ann = data.frame(x = 600, y = 710) %>%
  mutate(label = paste0("r = ", round(ls_cor_test$estimate, 2),
                        "\n", "p = ", round(ls_cor_test$p.value, 2)))
p_O_t12 = O_t12 %>%
  ggplot(aes(x = T1, y = T2)) +
  geom_point(size = 4, alpha = 0.6) +
  labs(title = "Observed Abundances for T1 and T2") +
  geom_label(data = df_ann, aes(x = x, y = y, label = label), 
             size = 6, vjust = -0.5, hjust = 0, color = "black") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

ggarrange(p_A_t12, p_O_t12, ncol = 2)

2. Standard Pearson correlation

2.1 Correlation matrix

o[is.infinite(o)] = NA
res_sample = Hmisc::rcorr(t(o), type = "pearson")
corr_sample = res_sample$r
corr_sample_p = res_sample$P
diag(corr_sample_p) = 0

corr_sample_fl = p_filter(corr_sample, corr_sample_p, max_p = 0.005)

df = corr_sample_fl[1:5, 1:5]
df_p = data.frame(get_upper_tri(df)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))

p_sample = df_p %>%
  ggplot(aes(var1, var2, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name = NULL) +
  scale_x_discrete(drop = FALSE, position = "top") +
  scale_y_discrete(drop = FALSE, limits = rev, position = "right") +
  geom_text(aes(var1, var2, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = "Standard Pearson") +
  theme_minimal() +
  theme(axis.text.x = element_text(vjust = 1, size = 12, hjust = 1),
        axis.text.y = element_text(size = 12),
        plot.title = element_text(hjust = 0.5),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        legend.position = c(0.7, 0.8),
        legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "top", title.hjust = 0.5)) +
  coord_fixed()

p_sample

2.2 P-value

p_dense_sample = p_dens(corr_sample_p)
p_dense_sample = p_dense_sample + 
  labs(title = "Standard Pearson") +
  theme(plot.title = element_text(hjust = 0.5))

ggplotly(p_dense_sample)

3. SparCC

corr_sparcc = sparcc(t(O))$Cor
colnames(corr_sparcc) = rownames(O)
rownames(corr_sparcc) = rownames(O)

corr_sparcc_th = hard_thresh(corr_sparcc, th = 0.3)

df = corr_sparcc_th[1:5, 1:5]
df_p = data.frame(get_upper_tri(df)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))

p_sparcc = df_p %>%
  ggplot(aes(var1, var2, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name = NULL) +
  scale_x_discrete(drop = FALSE, position = "top") +
  scale_y_discrete(drop = FALSE, limits = rev, position = "right") +
  geom_text(aes(var1, var2, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = "SparCC") +
  theme_minimal() +
  theme(axis.text.x = element_text(vjust = 1, size = 12, hjust = 1),
        axis.text.y = element_text(size = 12),
        plot.title = element_text(hjust = 0.5),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        legend.position = c(0.7, 0.8),
        legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "top", title.hjust = 0.5)) +
  coord_fixed()

p_sparcc

4. SECOM

OTU = otu_table(O, taxa_are_rows = TRUE)
meta_data = data.frame(sample_id = sample_id)
META = sample_data(meta_data)
sample_names(META) = meta_data$sample_id
otu_data = phyloseq(OTU, META)

pseqs = list(c(otu_data, otu_data))
pseudo = 0; prv_cut = 0.5; lib_cut = 1000; corr_cut = 0.5
wins_quant = c(0.05, 0.95); method = "pearson"; soft = FALSE; thresh_len = 20
n_cv = 10; thresh_hard = 0; max_p = 0.005; n_cl = 2

set.seed(123)
res_linear = secom_linear(pseqs, pseudo, prv_cut, lib_cut, corr_cut, 
                          wins_quant, method, soft, thresh_len, n_cv, 
                          thresh_hard, max_p, n_cl)

R = 1000; max_p = 0.005
set.seed(123)
res_dist = secom_dist(pseqs, pseudo, prv_cut, lib_cut, corr_cut,
                      wins_quant, R, thresh_hard, max_p, n_cl)

4.1 Bias-corrected abundance estimation

y_hat = res_linear$y_hat

p_y_hat = data.frame(t(y_hat[1:5, ])) %>% 
  ggpairs(title = "Bias-Corrected Abundances", 
          progress = FALSE, upper = NULL, diag = NULL) +
  theme_bw() + 
  theme(plot.title = element_text(hjust = 0.5),
        strip.background = element_rect(fill = "white"))

print(p_y_hat)

4.2 Linear correlation

4.21 Thresholding

row_ind = which(rownames(res_linear$corr_th) %in% paste0("T", 1:5))
col_ind = which(colnames(res_linear$corr_th) %in% paste0("T", 1:5))
df = res_linear$corr_th[row_ind, col_ind]
df_p = data.frame(get_upper_tri(df)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))

p_secom1 = df_p %>%
  ggplot(aes(var1, var2, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name = NULL) +
  scale_x_discrete(drop = FALSE, position = "top") +
  scale_y_discrete(drop = FALSE, limits = rev, position = "right") +
  geom_text(aes(var1, var2, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = "SECOM (Pearson1)") +
  theme_minimal() +
  theme(axis.text.x = element_text(vjust = 1, size = 12, hjust = 1),
        axis.text.y = element_text(size = 12),
        plot.title = element_text(hjust = 0.5),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        legend.position = c(0.7, 0.8),
        legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "top", title.hjust = 0.5)) +
  coord_fixed()

p_secom1

4.22 Filtering

df = res_linear$corr_fl[row_ind, col_ind]
df_p = data.frame(get_upper_tri(df)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))

p_secom2 = df_p %>%
  ggplot(aes(var1, var2, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name = NULL) +
  scale_x_discrete(drop = FALSE, position = "top") +
  scale_y_discrete(drop = FALSE, limits = rev, position = "right") +
  geom_text(aes(var1, var2, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = "SECOM (Pearson2)") +
  theme_minimal() +
  theme(axis.text.x = element_text(vjust = 1, size = 12, hjust = 1),
        axis.text.y = element_text(size = 12),
        plot.title = element_text(hjust = 0.5),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        legend.position = c(0.7, 0.8),
        legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "top", title.hjust = 0.5)) +
  coord_fixed()

p_secom2

4.23 P-value

p_dense_secom1 = p_dens(res_linear$corr_p)
p_dense_secom1 = p_dense_secom1 + 
  labs(title = "SECOM (Pearson2)") +
  theme(plot.title = element_text(hjust = 0.5))

ggplotly(p_dense_secom1)

4.3 Distance correlation

4.31 Filtering

df = res_dist$dcorr_fl[row_ind, col_ind]
df_p = data.frame(get_upper_tri(df)) %>%
  rownames_to_column("var1") %>%
  pivot_longer(cols = T1:T5, names_to = "var2", values_to = "value") %>%
  filter(!is.na(value)) %>%
  mutate(value = round(value, 2))
df_p$var1 = factor(df_p$var1, levels = paste0("T", 1:5))
df_p$var2 = factor(df_p$var2, levels = paste0("T", 1:5))

p_secom3 = df_p %>%
  ggplot(aes(var1, var2, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "white",
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name = NULL) +
  scale_x_discrete(drop = FALSE, position = "top") +
  scale_y_discrete(drop = FALSE, limits = rev, position = "right") +
  geom_text(aes(var1, var2, label = value), color = "black", size = 4) +
  labs(x = NULL, y = NULL, title = "SECOM (Distance)") +
  theme_minimal() +
  theme(axis.text.x = element_text(vjust = 1, size = 12, hjust = 1),
        axis.text.y = element_text(size = 12),
        plot.title = element_text(hjust = 0.5),
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(),
        legend.position = c(0.7, 0.8),
        legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "top", title.hjust = 0.5)) +
  coord_fixed()

p_secom3

4.32 P-value

p_dense_secom2 = p_dens(res_dist$dcorr_p)
p_dense_secom2 = p_dense_secom2 + 
  labs(title = "SECOM (Distance)") +
  theme(plot.title = element_text(hjust = 0.5))

ggplotly(p_dense_secom2)

5. Outputs

p_main1 = ggarrange(ggmatrix_gtable(p_a), ggmatrix_gtable(p_o), 
                    ncol = 2, nrow = 1, labels = c("a", "b"))
p_main2 = ggarrange(p_sample, p_sparcc, p_secom3, ncol = 3,
                    labels = c("c", "d", "e"), common.legend = TRUE, 
                    legend = "bottom")
p_main = ggarrange(p_main1, p_main2, ncol = 1, nrow = 2)
ggsave(plot = p_main, "../images/main/illustrate.pdf", height = 12, width = 12)   
ggsave(plot = p_main, "../images/main/illustrate.jpeg", height = 12, width = 12, dpi = 300)

df_p = data.frame(p = corr_sample_p[upper.tri(corr_sample_p)], method = "Pearson") %>%
  bind_rows(
    data.frame(p = res_dist$dcorr_p[upper.tri(res_dist$dcorr_p)], method = "SECOM (Distance)")
  )
df_line = data.frame(xintercept = c(0.01, 0.05),
                     stringsAsFactors = FALSE)
p_supp = df_p %>%
  ggplot(aes(x = p, color = method)) +
  scale_color_npg(name = NULL) +
  stat_ecdf(geom = "point") +
  geom_vline(xintercept = 0.005, 
             linetype = "dashed", size = 1) +
  labs(x = "P-value", y = "Cumulative density") +
  scale_x_continuous(breaks = c(0.005, 0.25, 0.50, 0.75, 1.00)) +
  theme_bw()
p_supp

ggsave(plot = p_supp, "../images/supp/supp_p_value.pdf", height = 5, width = 6.25)   
ggsave(plot = p_supp, "../images/supp/supp_p_value.jpeg", height = 5, width = 6.25, dpi = 300)

Session information

sessionInfo()
R version 4.2.0 (2022-04-22)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur/Monterey 10.16

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggsci_2.9         ggpubr_0.4.0      SpiecEasi_1.1.2   plotly_4.10.0    
 [5] GGally_2.1.2      ANCOMBC_1.6.2     microbiome_1.18.0 phyloseq_1.40.0  
 [9] forcats_0.5.1     stringr_1.4.0     dplyr_1.0.9       purrr_0.3.4      
[13] readr_2.1.2       tidyr_1.2.0       tibble_3.1.7      ggplot2_3.3.6    
[17] tidyverse_1.3.1  

loaded via a namespace (and not attached):
  [1] readxl_1.4.0           backports_1.4.1        Hmisc_4.7-0           
  [4] VGAM_1.1-6             plyr_1.8.7             igraph_1.3.2          
  [7] lazyeval_0.2.2         splines_4.2.0          crosstalk_1.2.0       
 [10] GenomeInfoDb_1.32.2    digest_0.6.29          foreach_1.5.2         
 [13] htmltools_0.5.2        fansi_1.0.3            magrittr_2.0.3        
 [16] checkmate_2.1.0        cluster_2.1.3          doParallel_1.0.17     
 [19] tzdb_0.3.0             Biostrings_2.64.0      modelr_0.1.8          
 [22] jpeg_0.1-9             colorspace_2.0-3       rvest_1.0.2           
 [25] haven_2.5.0            rbibutils_2.2.8        xfun_0.31             
 [28] crayon_1.5.1           RCurl_1.98-1.7         jsonlite_1.8.0        
 [31] Exact_3.1              survival_3.3-1         iterators_1.0.14      
 [34] ape_5.6-2              glue_1.6.2             gtable_0.3.0          
 [37] zlibbioc_1.42.0        XVector_0.36.0         car_3.1-0             
 [40] Rhdf5lib_1.18.2        shape_1.4.6            BiocGenerics_0.42.0   
 [43] abind_1.4-5            scales_1.2.0           mvtnorm_1.1-3         
 [46] DBI_1.1.3              rngtools_1.5.2         rstatix_0.7.0         
 [49] Rcpp_1.0.8.3           viridisLite_0.4.0      htmlTable_2.4.0       
 [52] foreign_0.8-82         proxy_0.4-27           Formula_1.2-4         
 [55] stats4_4.2.0           glmnet_4.1-4           htmlwidgets_1.5.4     
 [58] httr_1.4.3             pulsar_0.3.7           RColorBrewer_1.1-3    
 [61] ellipsis_0.3.2         farver_2.1.0           pkgconfig_2.0.3       
 [64] reshape_0.8.9          nnet_7.3-17            sass_0.4.1            
 [67] dbplyr_2.2.0           utf8_1.2.2             labeling_0.4.2        
 [70] tidyselect_1.1.2       rlang_1.0.2            reshape2_1.4.4        
 [73] munsell_0.5.0          cellranger_1.1.0       tools_4.2.0           
 [76] cli_3.3.0              generics_0.1.2         ade4_1.7-19           
 [79] broom_0.8.0            evaluate_0.15          biomformat_1.24.0     
 [82] fastmap_1.1.0          yaml_2.3.5             knitr_1.39            
 [85] fs_1.5.2               rootSolve_1.8.2.3      nlme_3.1-158          
 [88] doRNG_1.8.2            xml2_1.3.3             compiler_4.2.0        
 [91] rstudioapi_0.13        png_0.1-7              ggsignif_0.6.3        
 [94] e1071_1.7-11           huge_1.3.5             reprex_2.0.1          
 [97] bslib_0.3.1            DescTools_0.99.45      stringi_1.7.6         
[100] gsl_2.1-7.1            highr_0.9              lattice_0.20-45       
[103] Matrix_1.4-1           nloptr_2.0.3           vegan_2.6-2           
[106] permute_0.9-7          multtest_2.52.0        vctrs_0.4.1           
[109] pillar_1.7.0           lifecycle_1.0.1        rhdf5filters_1.8.0    
[112] Rdpack_2.3.1           jquerylib_0.1.4        cowplot_1.1.1         
[115] data.table_1.14.2      bitops_1.0-7           lmom_2.9              
[118] R6_2.5.1               latticeExtra_0.6-29    gridExtra_2.3         
[121] IRanges_2.30.0         gld_2.6.4              codetools_0.2-18      
[124] boot_1.3-28            energy_1.7-10          MASS_7.3-57           
[127] assertthat_0.2.1       rhdf5_2.40.0           withr_2.5.0           
[130] S4Vectors_0.34.0       GenomeInfoDbData_1.2.8 mgcv_1.8-40           
[133] expm_0.999-6           parallel_4.2.0         hms_1.1.1             
[136] grid_4.2.0             rpart_4.1.16           class_7.3-20          
[139] rmarkdown_2.14         carData_3.0-5          Rtsne_0.16            
[142] Biobase_2.56.0         lubridate_1.8.0        base64enc_0.1-3